#Importing libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
import os
import matplotlib.pyplot as plt#visualization
from PIL import Image
%matplotlib inline
import pandas as pd
import seaborn as sns#visualization
import itertools
import warnings
warnings.filterwarnings("ignore")
import io
import plotly.offline as py#visualization
py.init_notebook_mode(connected=True)#visualization
import plotly.graph_objs as go#visualization
import plotly.tools as tls#visualization
import plotly.figure_factory as ff#visualization
import plotly.express as px
# Record when the analysis started. `pd.datetime` was deprecated in pandas 1.0
# and removed in 2.0; `pd.Timestamp` is a `datetime` subclass, so later
# arithmetic against `start_time` keeps working.
start_time = pd.Timestamp.now()
data = pd.read_csv("train.csv")
# First few rows (a bare expression: only rendered when it ends a notebook cell).
data.head()
print("Rows : ", data.shape[0])
print("Columns : ", data.shape[1])
print("\nFeatures : \n", data.columns.tolist())
print("\nMissing values : ", data.isnull().sum().values.sum())
print("\nUnique values : \n", data.nunique())
# Percentage of missing values per column.
percent_missing = data.isnull().sum() * 100 / len(data)
missing_value_df = pd.DataFrame({
    'column_name': data.columns,
    'percent_missing': percent_missing,
})
missing_value_df
# Data type of each column.
data.dtypes
# Summary statistics for the numeric fields.
data.describe()
# Number of distinct values in the `id` column.
data['id'].nunique()
def distribution_plot(dataset, column, title, xtitle, ytitle):
    """Render an interactive histogram of one column with plotly.

    Args:
        dataset: Object indexable by ``column`` (the callers pass a DataFrame).
        column: Key of the values to bin.
        title: Figure title.
        xtitle: Label for the x axis.
        ytitle: Label for the y axis.

    The figure is displayed through ``py.iplot``; nothing is returned.
    """
    histogram = go.Histogram(
        x=dataset[column],
        opacity=0.7,
        marker=dict(line=dict(color="#25232C")),
    )
    # showgrid=False removes the grid lines; the rgba(0,0,0,0) colours make
    # both the plot and the paper background fully transparent.
    transparent = 'rgba(0,0,0,0)'
    layout = go.Layout(
        title=title,
        xaxis=dict(title=xtitle, showgrid=False),
        yaxis=dict(title=ytitle, showgrid=False),
        plot_bgcolor=transparent,
        paper_bgcolor=transparent,
    )
    py.iplot(dict(data=[histogram], layout=layout))
# Univariate distributions: one interactive histogram per column, drawn in the
# order listed. Every plot uses "Count" as its y-axis label.
distribution_plot(data, 'log_price', "Log Price Distribution", "log_price", "Count")
data['property_type'].value_counts()
for _column, _title, _xtitle in [
    ('property_type', "Property Type Distribution", "Property Type"),
    ('room_type', "Room Type Distribution", "Room Type"),
    ('accommodates', "Accommodates Distribution", "Accommodates"),
    ('bathrooms', "Bathroom Distribution", "Bathrooms"),
    ('bed_type', "Bed Type Distribution", "Bed Type"),
    ('cancellation_policy', "Cancellation Policy Distribution", "Cancellation Policy"),
    ('cleaning_fee', "Cleaning Fee Distribution", "Cleaning Fee"),
    ('city', "City Distribution", "City"),
    ('host_has_profile_pic', "Host Profile Pic Distribution", "Host Profile(Yes/No)"),
    ('host_identity_verified', "Host Indentity Disrtibution Distribution", "Host Identification(Yes/No)"),
    ('host_response_rate', "Host Response Rate Distribution", "Host Response Rate"),
    ('instant_bookable', "Instant Bookable Distribution", "Instant Bookable(Yes/No)"),
    ('number_of_reviews', "Number of Review Distribution", "Number of Review"),
    ('review_scores_rating', "Review Rating Distribution", "Review Rating"),
]:
    distribution_plot(data, _column, _title, _xtitle, "Count")
# Median review rating (the original author noted a value of 96 for this dataset).
data['review_scores_rating'].median()
for _column, _title, _xtitle in [
    ('bedrooms', "Number of Bedroom Distribution", "Number of Bedroom"),
    ('beds', "Number of Bed Distribution", "Number of Bed"),
]:
    distribution_plot(data, _column, _title, _xtitle, "Count")
# Convert the column to a datetime dtype so the dates sort chronologically.
# `infer_datetime_format` is omitted: it is deprecated and has no effect since
# pandas 2.0, where format inference is the default.
data['host_since'] = pd.to_datetime(data['host_since'])
# Count rows per start date. Both output columns are named explicitly because
# the names produced by `value_counts().reset_index()` changed in pandas 2.0
# (from `index`/`host_since` to `host_since`/`count`); naming them here keeps
# the `index` (date) and `host_since` (count) columns on every version.
host_since = (
    data['host_since']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='host_since')
    .sort_values('index')
)
host_since.head()
# Line chart of the per-date counts.
fig = px.line(host_since, x='index', y='host_since')
fig.update_layout(
    title='Year Host Started Distribution',
    xaxis={"title": 'Year', "showgrid": False},
    yaxis={"title": 'Count', "showgrid": False},
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
)
fig.show()
# Top 5 listing names.
data['name'].value_counts().head()
# Top 5 neighbourhoods.
data['neighbourhood'].value_counts().head()
# host_identity_verified
# The original plotted `instant_bookable` here although the comment, title and
# axis label all describe host identity; plot the matching column instead.
distribution_plot(data, 'host_identity_verified', "Host Indentity Disrtibution Distribution", "Host Identification(Yes/No)", "Count")
# host_response_rate
distribution_plot(data, 'host_response_rate', "Host Response Rate Distribution", "Host Response Rate", "Count")
# Frequency of each raw amenities string and of each cancellation policy.
data['amenities'].value_counts()
data['cancellation_policy'].value_counts()
# Scatter maps of the listings on OpenStreetMap tiles, one figure per row of
# the table below: (hover column, colour column, initial zoom).
#   1. price               -> coloured by log_price
#   2. number of beds      -> coloured by beds
#   3. review rating       -> coloured by review_scores_rating
for _hover, _color, _zoom in (
    ("log_price", "log_price", 3),
    ("property_type", "beds", 4),
    ("property_type", "review_scores_rating", 4),
):
    fig = px.scatter_mapbox(
        data,
        lat="latitude",
        lon="longitude",
        hover_data=[_hover],
        color=_color,
        color_discrete_sequence=["fuchsia"],
        zoom=_zoom,
        height=300,
    )
    # Use the open-street-map style and strip all margins around the map.
    fig.update_layout(
        mapbox_style="open-street-map",
        margin=dict(r=0, t=0, l=0, b=0),
    )
    fig.show()
# Work on a separate one-column frame holding the raw amenities strings.
Amen = pd.DataFrame(data['amenities'])
Amen.head()
# Remove the braces and double quotes in a single pass (explicit regex=True so
# the character class is honoured regardless of the pandas default).
Amen["amenities"] = Amen["amenities"].str.replace(r'[{}"]', "", regex=True)
# Turn each description into a list by splitting on ",".
Amen["amenities"] = Amen["amenities"].str.split(pat=",")
Amen.head()
# Vectorize the items of each list by count. The items are re-joined with a
# sentinel separator so the tokenizer can split them back into whole amenities
# instead of individual words.
from sklearn.feature_extraction.text import CountVectorizer
inp = ["<some_space>".join(x) for x in Amen["amenities"]]
vectorizer = CountVectorizer(tokenizer=lambda x: x.split("<some_space>"), analyzer="word")
vector = vectorizer.fit_transform(inp)
# `get_feature_names()` was removed in scikit-learn 1.2; prefer the `_out`
# variant and fall back to the old name on releases that lack it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)
# Densify once and reuse the array for both the frame and the column totals.
dense_counts = vector.toarray()
Amen_df = pd.DataFrame(dense_counts, columns=feature_names)
Amen_df.head()
count_list = dense_counts.sum(axis=0)
amen_dict = dict(zip(feature_names, count_list))
# Remove the empty token if present; pop() does not raise when it is absent.
amen_dict.pop('', None)
amen_dict
# From here on `amen_dict` is a list of (amenity, count) pairs, ascending by count.
amen_dict = sorted(amen_dict.items(), key=lambda x: x[1])
amen_dict
# Generate the word cloud.
from PIL import Image
from wordcloud import WordCloud
# generate_from_frequencies() needs a mapping, so rebuild a dict from the
# sorted pairs rather than passing the list itself.
wc = WordCloud(
    background_color="white",
    width=1000,
    height=1000,
    max_words=10,
    relative_scaling=0.5,
    normalize_plurals=False,
).generate_from_frequencies(dict(amen_dict))
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()